# Ven mun data ####

library(tidyverse)
library(readxl)

## Wikipedia Municipal size data ####

# Replace acute-accented vowels (lower and upper case) with their plain
# ASCII counterparts. Any other character (e.g. "ñ", "ü") is left untouched.
#
# text: character vector to clean.
# Returns a character vector of the same length as `text`.
remove_accents <- function(text) {
  accent_map <- c(
    "á" = "a", "é" = "e", "í" = "i", "ó" = "o", "ú" = "u",
    "Á" = "A", "É" = "E", "Í" = "I", "Ó" = "O", "Ú" = "U"
  )
  str_replace_all(text, accent_map)
}


# Load the municipal surface/size table from the Excel workbook.
mun_size <- read_excel("data/VEN_extension_municipios.xlsx")

# Print the column names as a quick sanity check of the imported sheet.
names(mun_size)

# Build the "ESTADO_municipio" key (upper-case state, lower-case municipality,
# both without accents) and keep only the key plus the size columns.
mun_size <- mun_size %>%
  mutate(
    municipio = remove_accents(tolower(Municipio)),
    estado = remove_accents(toupper(estado)),
    mun_edo = paste(estado, municipio, sep = "_")
  ) %>%
  select(mun_edo, superficie_km2, mun_size_hab)


## Aggregate at municipal level ####

# Load the electoral panel that is aggregated below.
ven <- readRDS("ven_elec_2006_2024_final.rds")

# Collapse `ven` to one row per year x state x municipality, then attach the
# municipal size table (`mun_size`) through the `mun_edo` key.
# NOTE: the order of the expressions inside summarise() matters. The weighted
# means must be computed BEFORE the across() call, because across() replaces
# the remaining numeric columns (including the weight columns) with their
# group totals, and later expressions would then see those totals.
ven_mun <- ven %>%
  group_by(year,cod_edo, estado, cod_mun, municipio ) %>%
  summarise(
    # Population should sum unique values per Parroquia to avoid duplication
    # NOTE(review): unique() de-duplicates on the value itself, so two units
    # with exactly the same population are counted once -- confirm acceptable.
    # sum(..., na.rm = TRUE) returns 0 (not NA) when every value is missing.
    pob_2011 = sum(unique(pob_2011), na.rm = TRUE),
    pob_proy_2020 = sum(unique(pob_proy_2020), na.rm = TRUE),
    
    # Weighted means for percentages (computed directly in summarise)
    # Vote shares are weighted by `validos` and rounded to 2 decimals; turnout
    # is weighted by `rep_c` and left unrounded.
    # na.rm = TRUE only drops missing values of the averaged variable; a
    # missing weight still yields NA for the group.
    of_p = round(weighted.mean(of_p, validos, na.rm = TRUE), 2),
    op_p = round(weighted.mean(op_p, validos, na.rm = TRUE), 2),
    otro_p = round(weighted.mean(otro_p, validos, na.rm = TRUE), 2),
    turnout = weighted.mean(turnout, rep_c, na.rm = TRUE),
    
    # Sum other numeric variables (excluding already computed ones)
    # Grouping columns are never selected by across(). Every other numeric
    # column is summed.
    # NOTE(review): this would also sum numeric ID/code columns, if any exist.
    across(where(is.numeric) & !c(pob_2011, pob_proy_2020, of_p, op_p, otro_p, turnout), \(x) sum(x, na.rm = TRUE)),
    
    # Create municipio-estado identifier
    # Both parts are grouping columns (constant within a group), hence first().
    mun_edo = first(paste(estado, municipio, sep = "_")),
    .groups = "drop"
  ) %>% 
  # Give every row of a municipality the same population figures: the maximum
  # observed across all rows sharing `mun_edo` (i.e. across its years).
  group_by(mun_edo) %>%
  mutate(
    pob_2011 = max(pob_2011, na.rm = TRUE),
    pob_proy_2020 = max(pob_proy_2020, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # Attach surface/size columns; municipalities without a match get NA.
  # NOTE(review): duplicated `mun_edo` keys in `mun_size` would duplicate rows.
  left_join(., mun_size, by = "mun_edo")

## Create urban-rural variables following UN/OCDE methodology ####

# Derive population density and two urban/rural classifications.
# Rows whose density is NA (missing surface) stay NA in both classifications,
# except that municipalities under 5,000 inhabitants are always "rural" in the
# three-level variable.
ven_mun <- ven_mun %>%
  mutate(
    # Inhabitants per km2, based on the 2011 population.
    densidad_hab_km2 = pob_2011 / superficie_km2,

    # Three-level classification. All thresholds are inclusive lower bounds
    # ("at least 50,000 / 1,500", "at least 5,000 / 300"), so a population of
    # exactly 5,000 or a density between 299 and 300 is classified
    # consistently instead of falling between branches.
    # Branch order matters: case_when() uses the first matching condition.
    urb_level_ocde_cat = case_when(
      pob_2011 >= 50000 & densidad_hab_km2 >= 1500 ~ "urban",
      pob_2011 >= 5000 & densidad_hab_km2 >= 300 ~ "intermediate",
      pob_2011 >= 5000 & densidad_hab_km2 < 300 ~ "rural",
      pob_2011 < 5000 ~ "rural"
    ),

    # Binary classification using a single 150 inhab/km2 density cut-off.
    urb_level_ocde_bi = case_when(
      densidad_hab_km2 >= 150 ~ "urban",
      densidad_hab_km2 < 150 ~ "rural"
    )
  )

## Save final dataset ####

# Write the municipal panel to disk as RDS and as CSV (without row names).
saveRDS(ven_mun, "ven_mun_elec_2006_2024_final.rds")
write.csv(ven_mun, "ven_mun_elec_2006_2024.csv", row.names = FALSE)

# NOTE(review): despite the .RData extension this file is written with
# saveRDS(), so it must be read back with readRDS(), not load().
saveRDS(ven_mun, file = "ven_mun_2006_2024.RData")
